[FRAUD] Data cleanup attempt (8.18 - failed: split df50 into tr/test, then tried recombining them..)

Author

김보람

Published

August 18, 2023

imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import torch

# sklearn
from sklearn import model_selection # for the train_test_split function
from sklearn import ensemble # RF,GBM
from sklearn import metrics 

# embedding 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
    df["from"]=df["cc_num"].apply(lambda x:mapping[x])  # edge source (card holder)
    df["to"]=df["merchant"].apply(lambda x:mapping[x])  # edge target (merchant)
    
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
    df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
    
    G=nx.from_edgelist(df[["from","to"]].values, create_using=graph_type)
    
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  # edge attribute: fraud label of each edge
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # edge attribute: total transaction amount of each edge

    return G


def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}
    df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])   # card-holder node
    df["out_node"]=df["merchant"].apply(lambda x:mapping[x])  # merchant node
    
    # each transaction becomes its own node, linked to its card holder and its merchant
    G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")     
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")   
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")  
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")

    return G
    
    
def down_sample_textbook(df):
    df_majority = df[df.is_fraud==0].copy()
    df_minority = df[df.is_fraud==1].copy()
    df_maj_downsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled

def embedding(Graph):
    # Graph -> X (feature)
    _edgs = list(Graph.edges)
    subGraph = Graph.edge_subgraph([_edgs[x] for x in range(len(Graph.edges))]).copy()
    subGraph.add_nodes_from(list(set(Graph.nodes) - set(subGraph.nodes)))    
    embedded = AverageEmbedder(Node2Vec(subGraph, weight_key='weight').fit(window=10).wv)
    X = [embedded[str(_edgs[x][0]), str(_edgs[x][1])] for x in range(len(Graph.edges))]
    # Graph -> y (label)
    y = np.array(list(nx.get_edge_attributes(Graph, "label").values()))
    return X,y 

def anal(df):
    Graph = build_graph_bipartite(df)
    X, y = embedding(Graph)
    X, XX, y, yy = model_selection.train_test_split(X, y, random_state=42)  # tr/test split of the edge embeddings
    lrnr = ensemble.RandomForestClassifier(n_estimators=100, random_state=42) 
    lrnr.fit(X,y)
    yyhat = lrnr.predict(XX)
    df = pd.DataFrame({
        'acc':[sklearn.metrics.accuracy_score(yy,yyhat)], 
        'pre':[sklearn.metrics.precision_score(yy,yyhat)], 
        'rec':[sklearn.metrics.recall_score(yy,yyhat)],
        'f1':[sklearn.metrics.f1_score(yy,yyhat)]}
    )    
    return df

def our_sampling1(df):
    cus_list = set(df.query('is_fraud==1').cc_num.tolist())
    return df.query("cc_num in @cus_list")
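
A minimal usage sketch of the helpers above (commented out, not run in this post; df02 is built a few cells below):

# df50 = down_sample_textbook(df02)   # balanced 50/50 fraud sample
# anal(df50)                          # one-row DataFrame with acc / pre / rec / f1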
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
1048575 rows × 22 columns: trans_date_trans_time, cc_num, merchant, category, amt, first, last, gender, street, city, ..., lat, long, city_pop, job, dob, trans_num, unix_time, merch_lat, merch_long, is_fraud

Attempt

_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)  # 20% of the normal transactions
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]                                     # all fraudulent transactions
df02 = pd.concat([_df1,_df2])
df02.shape
(214520, 22)
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
12012*12012  # number of ordered (i, j) pairs for a fully dense edge list
144288144

Things to consider (230810)

  • Since df50 has about 12,000 rows, build a train mask out of 9,000 True and 3,000 False entries.

  • If two transactions share the same customer info (cc_num), set their edge to 1, otherwise 0, then put a weight on the 1-edges.

  • i.e. the weight in g(V, E, W).

df50 = df50.reset_index()
N = len(df50)

tr/test

df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)
(0.49828, 0.50516)
df50_tr.shape, df50_test.shape
((9009, 23), (3003, 23))
train_mask = np.concatenate((np.full(9009, True), np.full(3003, False)))  # rows 0..9008 of df50_com = train split
test_mask = np.concatenate((np.full(9009, False), np.full(3003, True)))   # rows 9009..12011 = test split
print("Train Mask:", train_mask)
print("Test Mask:", test_mask)
Train Mask: [ True  True  True ... False False False]
Test Mask: [False False False ...  True  True  True]
train_mask.shape, test_mask.shape
((12012,), (12012,))
train_mask.sum(), test_mask.sum()
(9009, 3003)
df50_com = pd.concat([df50_tr, df50_test])
df50_com = df50_com.reset_index()
df50_com
12012 rows × 24 columns: level_0, index, trans_date_trans_time, cc_num, merchant, category, amt, first, last, gender, ..., merch_lat, merch_long, is_fraud (level_0 and index come from the two index resets)


Is this the weight?

# edge_index_list = []
# for i in range(N):
#     for j in range(N):
#         time_difference = (df50['trans_date_trans_time'][i] - df50['trans_date_trans_time'][j]).total_seconds()
#         edge_index_list.append([i, j, time_difference])
# edge_index_list[:5]
[[0, 0, 0.0],
 [0, 1, -2460.0],
 [0, 2, -7140.0],
 [0, 3, -9120.0],
 [0, 4, -10140.0]]
# np.save('edge_index_list_50.npy', edge_index_list)

# loaded_data = np.load('edge_index_list_50.npy')
# edge_index = np.array(edge_index_list)
# edge_index[:,2] = np.abs(edge_index[:,2])
# theta = edge_index[:,2].mean()
# theta
12238996.895508753
# edge_index[:,2] = (np.exp(-edge_index[:,2]/theta)!=1) * np.exp(-edge_index[:,2]/theta)
# edge_index
array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 9.99799023e-01],
       [0.00000000e+00, 2.00000000e+00, 9.99416789e-01],
       ...,
       [1.20110000e+04, 1.20090000e+04, 4.19756312e-01],
       [1.20110000e+04, 1.20100000e+04, 2.26811434e-01],
       [1.20110000e+04, 1.20110000e+04, 0.00000000e+00]])
# edge_index[:,2]
array([0.        , 0.99979902, 0.99941679, ..., 0.41975631, 0.22681143,
       0.        ])
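
For reference, a vectorized sketch of the same weight, assuming df50 keeps its fresh 0..N-1 index: w_ij = exp(-|t_i - t_j| / theta), where theta is the mean absolute time gap over all pairs, and entries where the factor is exactly 1 (zero time gap, e.g. the diagonal) are zeroed out. Note the dense N×N matrix costs roughly 12012^2 × 8 bytes ≈ 1.2 GB of memory.

# vectorized equivalent of the commented loop above (memory-heavy: N x N floats)
t = df50['trans_date_trans_time'].astype('int64').to_numpy() / 1e9  # timestamps in seconds
abs_diff = np.abs(t[:, None] - t[None, :])                          # |t_i - t_j| for every pair
theta = abs_diff.mean()
W = np.exp(-abs_diff / theta)
W[W == 1.0] = 0.0                                                   # zero out exact-zero time gaps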

Q. But if we split into train and test at random below.. how do we apply the weights and the edges? (One possible approach is sketched right after this.)
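
One hedged option, assuming PyG's GCNConv (whose forward accepts an optional edge_weight): keep the split at the node level via train_mask/test_mask, carry the per-edge weights next to edge_index in the Data object, and pass them through the convolutions. The tensor w below is hypothetical (the decayed weights aligned with the columns of edge_index); a, b and edge_one_index_com are built in the cells further down.

# w = ...  # hypothetical: per-edge weight tensor, shape [num_edges]
# data = torch_geometric.data.Data(x=a, edge_index=edge_one_index_com, edge_weight=w,
#                                  y=b, train_mask=train_mask, test_mask=test_mask)
# ...and inside GCN.forward():
# x = self.conv1(x, data.edge_index, data.edge_weight)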

edge: if two rows share the same cc_num, edge=1; otherwise edge=0

edge_index_list2_com = []
for i in range(N):
    for j in range(N):
        if df50_com['cc_num'][i] != df50_com['cc_num'][j]:  # different card holders
            edge = 0
        else:                                               # same card holder (including i == j)
            edge = 1
        edge_index_list2_com.append([i, j, edge])
np.save('edge_index_list2_50_com.npy', edge_index_list2_com)

loaded_data = np.load('edge_index_list2_50_com.npy')
edge_index_list2_com[:5]
[[0, 0, 1], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0]]
edge_one_com = [(i, j) for i, j, edge in edge_index_list2_com if edge == 1]
edge_one_com[:5]
[(0, 0), (0, 344), (0, 1377), (0, 1447), (0, 1639)]
len(edge_one_com)
200706
edge_one_index_com = torch.tensor(edge_one_com, dtype=torch.long).t()
edge_one_index_com.shape
torch.Size([2, 200706])
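
A faster equivalent sketch, assuming df50_com keeps its fresh 0..N-1 positional index: group the row positions by cc_num and emit every ordered pair inside each group (self-loops included). This reproduces the same 200706 edges without the O(N^2) Python loop; edge_one_index_fast is a hypothetical name.

import itertools
groups = df50_com.groupby('cc_num').indices       # cc_num -> array of row positions
pairs = [(i, j) for idx in groups.values()
                for i, j in itertools.product(idx, idx)]
edge_one_index_fast = torch.tensor(pairs, dtype=torch.long).t()
assert edge_one_index_fast.shape == edge_one_index_com.shape  # both [2, 200706]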

data setup (x, edge_index, y)

x = df50_com['amt']
a = torch.tensor(x.values, dtype=torch.float)  # transaction amount as the single node feature
a = a.reshape(-1,1)
a
tensor([[921.2400],
        [698.2800],
        [220.5600],
        ...,
        [ 17.9700],
        [  7.5800],
        [824.9900]])
y = df50_com['is_fraud']
b = torch.tensor(y.values, dtype=torch.int64)  # node labels (fraud / not fraud)
b
tensor([1, 1, 0,  ..., 1, 0, 1])
import torch_geometric
data = torch_geometric.data.Data(x=a, edge_index=edge_one_index_com, y=b,
                                 train_mask=torch.tensor(train_mask), test_mask=torch.tensor(test_mask))
data
Data(x=[12012, 1], edge_index=[2, 200706], y=[12012], train_mask=[12012], test_mask=[12012])
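
A quick sanity-check sketch on the assembled graph, using torch_geometric.utils.degree: every node carries at least its self-loop, and a node's degree counts how many rows share its cc_num.

from torch_geometric.utils import degree
deg = degree(data.edge_index[0], num_nodes=data.num_nodes)  # per-node degree from the source side
print(deg.min().item(), deg.mean().item(), deg.max().item())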


gnn



import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 16)   # 1-dim input feature (amt) -> 16 hidden units
        self.conv2 = GCNConv(16, 2)   # 16 hidden units -> 2 classes (fraud / not fraud)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
model = GCN()
model
GCN(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 2)
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
GCN(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 2)
)

for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])  # loss on train nodes only
    loss.backward()
    optimizer.step()
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')
Accuracy: 0.5012

Why does the accuracy keep coming out this bad?

data.y[data.test_mask].sum()
tensor(1517)
out[data.train_mask]
tensor([[-1.0039, -0.4564],
        [-1.6018, -0.2251],
        [-0.3896, -1.1311],
        ...,
        [-1.7755, -0.1856],
        [-0.9515, -0.4880],
        [-2.3393, -0.1014]], grad_fn=<IndexBackward0>)
data.y[data.test_mask]
tensor([0, 1, 0,  ..., 1, 0, 1])
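
A small diagnostic sketch, given that the test mask is roughly balanced (1517 positives out of 3003): 0.5012 accuracy is chance level, and a confusion matrix makes explicit how the predictions split.

from sklearn.metrics import confusion_matrix, classification_report
yy = data.y[data.test_mask].numpy()
yyhat = pred[data.test_mask].numpy()
print(confusion_matrix(yy, yyhat))
print(classification_report(yy, yyhat))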

Hmm............... let's redo the edge_list.

!!!! In the first attempt I didn't just blindly pick the edges equal to 1 for the edge_list, so why am I doing that here................